In [1]:
### ~~~~~ Libraries ~~~~~ ###

import os # makedir
import glob # use UNIX-style patterns to find files in directory

import numpy as np
import numpy.matlib as npm

import scipy
import scipy.stats # for multivariate_normal
import scipy.io # Need this to load .mat files

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
import matplotlib.mlab as mlab # Numerical python functions written for compatability with MATLAB commands with the same names.Numerical python functions written for compatability with MATLAB commands with the same names.
from cycler import cycler
In [2]:
### ~~~~~ Data Generation ~~~~~ ###

# Data set that Keene asked us to generate in class
def getKeeneData(N1, N2):
    
    # Total number of observations
    N = N1+N2;

    # Parameters for Gaussian (these are unknown and we're to estimate them)
    mu1 = np.array([2,2])
    mu2 = np.array([-2,-2])
    cov = np.array([[1,0],[0,1]]) # or do identity

    # Generate data samples from Gaussian
    # Samples are nObservations-by-nFeatures
    X1 = np.random.multivariate_normal(mu1,cov,N1)
    X2 = np.random.multivariate_normal(mu2,cov,N2)                               
    X = np.concatenate((X1,X2),axis=0)
        # concatenate data into one matrix
    Y = np.concatenate(( np.ones((X1.shape[0],1)), \
                        np.zeros((X2.shape[0],1)) ),axis=0)
        # generate column vector for corresponding values (we want to predict these)

    # Convert data to matrices for simpler Python operations
    X = np.asmatrix(X)
    Y = np.asmatrix(Y)
    
    return (X,Y)

def getSpecialData():
    
    mat = scipy.io.loadmat('data.mat')
    
    X_circles = np.asmatrix(mat['circles'][0][0][0])
    Y_circles = np.asmatrix(mat['circles'][0][0][1])

    X_unimodal = np.asmatrix(mat['unimodal'][0][0][0])
    Y_unimodal = np.asmatrix(mat['unimodal'][0][0][1])
    
    X_spiral = np.asmatrix(mat['spiral'][0][0][0])
    Y_spiral = np.asmatrix(mat['spiral'][0][0][1])
    
    X_bimodal = np.asmatrix(mat['bimodal'][0][0][0])
    Y_bimodal = np.asmatrix(mat['bimodal'][0][0][1])
    
    return (X_circles,Y_circles,X_unimodal,Y_unimodal,X_spiral,Y_spiral,X_bimodal,Y_bimodal)

# Generate randomly clustered data, 2 features
# Specify K (number of clusters), sizes of clusters as a list, r (distance between clusters),
def generateClusters(K, sizes, r=3):
    
    if K > len(sizes):
        print('Specified K (%d) exceeds size of list (%d).' % (K, len(sizes)))
        return
    
    # Parameters to generate data with using for Gaussian
    mu = []
    for k in range(K):
        # Specify position of means in polar coordinates first, then go back to Cartesian
        # This is so we can evenly divide clusters
        theta = 2*np.pi*k/float(K)
        x = r*np.cos(theta)
        y = r*np.sin(theta)
        mu.append(np.array([x,y]))
    cov = np.eye(2) # just use identity for all covariances

    # Generate data samples from Gaussian
    # Samples are nObservations-by-nFeatures
    X = np.empty((0,2)) # zero rows so that there are no initial values
    for k in range(K):       
        X_k = np.random.multivariate_normal(mu[k],cov,sizes[k])
        X = np.concatenate((X,X_k),axis=0) # concatenate rows together
 
    # Convert data to matrices for simpler Python operations
    X = np.asmatrix(X)

    return X

# Generate randomly clustered data, 1 feature
def generate1DClusters(K, sizes):
    
    if K > len(sizes):
        print('Specified K %d exceeds size of list %d' % (K, len(sizes)))
        return
    
    # Parameters to generate data with using for Gaussian
    mu = []
    for k in range(K):
        # Distribute clusters so that means are evenly separated from -x_range to x_range
        # Draw it visually and you'll see that this is the formula to do this
        x_range = 50
        x = -x_range + x_range/float(K) + 2*k*x_range/float(K)
        mu.append(x)
    sigma = 1 # just use 1 for variance

    # Generate data samples from Gaussian
    # Samples are nObservations-by-nFeatures
    X = []
    for k in range(K):       
        X_k = np.random.normal(mu[k],sigma,sizes[k])
        X.append(X_k)
 
    # Convert data to matrices for simpler Python operations
    # First unravel list to make all data in one row
    # Then convert to matrix and transpose to have N rows
    X = np.asmatrix(np.ravel(X)).T

    return X
In [3]:
### ~~~~~ Plotting Functions ~~~~~ ###

# Plot original data with mean and standard deviation contours around mean
# Only works for 2 features
def plotContours(X, mu, cov, frame):
    
    # Don't plot if not 2 features
    M = X.shape[1]
    if M != 2:
        return
        
    # Set figure size and get plot axes
    figsize = (8,6)
    plt.figure(figsize=figsize)
    axes = plt.gca()

    # Plot data
    plt.scatter(X[:,0],X[:,1],color='r')
    
    # Plot each mean
    K = len(mu)
    for k in range(K):
        plt.scatter(mu[k][0],mu[k][1],color='k')
    
    # One contour plot for each mean
    n1 = 100
    n2 = 100
    x1 = np.linspace(-10,10,n1)
    x2 = np.linspace(-10,10,n2)
    X1, X2 = np.meshgrid(x1,x2)
    for k in range(len(mu)):
        # Combine X1 and X2 into one matrix with two features (unroll them first) before putting in normal
        # When combining the two raveled matrices, don't use concatenate, because ravel makes it so it's 1D, so result of concat is 1D
        #print np.array([np.ravel(X1), np.ravel(X2)])
        #print np.concatenate((np.ravel(X1),np.ravel(X2)),axis=0)
        normpdf = scipy.stats.multivariate_normal(mean=np.ravel(mu[k]),cov=cov[k])
        temp = np.array([np.ravel(X1), np.ravel(X2)])
        z = []
        for i in range(n1*n2):
            z.append(normpdf.pdf(np.ravel(temp[:,i])))
        # Reshape z to have the same dimensions as meshgrid for plot
        # NOTE: reshape can be applied to lists and still return an ndarray
        z = np.reshape(z,(n1,n2))
        plt.contour(x1,x2,z)
    
    # Plot labels and limits
    plt.title('Observations with Means from EM (Contour Plot)')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.xlim([-10,10])
    plt.ylim([-10,10])
    
    # Show and save plot
    dirname = "anim/M=%d/K=%d" % (M,K) 
    figname = "EM_M=%d_K=%d_contour_%03d.png" % (M,K,frame)
    plt.savefig(dirname + '/' + figname)
    plt.show()
    
# Plot original data with colors corresponding to responsibility values
# Also plot the contour for one standard deviation away from mean
# Only works for 2 features
# Only works for 2 <= K <= 3
def plotColors(X, mu, cov, gamma, frame):
    
    # Don't plot if not 2 features
    M = X.shape[1]
    if M != 2:
        return
    
    # Don't plot if K isn't 2 or 3
    K = len(mu)
    if K > 3 or K < 2:
        return
    
    # Set figure size and get plot axes
    figsize = (8,6)
    plt.figure(figsize=figsize)
    axes = plt.gca()

    # Plot color-coded data
    N = X.shape[0]
    for n in range(N):
        r = gamma[0][n,0] # red is k = 1
        g = gamma[1][n,0] # green is k = 2
        if K == 2:
            b = 0 # fix one of the RGB values if only 2 clusters
        elif K == 3:
            b = gamma[2][n,0] # blue is k = 3
        plt.scatter(X[n,0],X[n,1],color=(r,g,b))
        
    # One contour plot for each mean
    K = len(mu)
    n1 = 100
    n2 = 100
    x1 = np.linspace(-10,10,n1)
    x2 = np.linspace(-10,10,n2)
    X1, X2 = np.meshgrid(x1,x2)
    for k in range(K):
        # Combine X1 and X2 into one matrix with two features (unroll them first) before putting in normal
        # When combining the two raveled matrices, don't use concatenate, because ravel makes it so it's 1D, so result of concat is 1D
        #print np.array([np.ravel(X1), np.ravel(X2)])
        #print np.concatenate((np.ravel(X1),np.ravel(X2)),axis=0)
        normpdf = scipy.stats.multivariate_normal(mean=np.ravel(mu[k]),cov=cov[k])
        temp = np.array([np.ravel(X1), np.ravel(X2)])
        z = []
        for i in range(n1*n2):
            z.append(normpdf.pdf(np.ravel(temp[:,i])))
        # Reshape z to have the same dimensions as meshgrid for plot
        # NOTE: reshape can be applied to lists and still return an ndarray
        z = np.reshape(z,(n1,n2))
        # Only want to plot points that are one std. dev away
        rgb = 'r' if (k == 0) else ('g' if (k == 1) else 'b');
        plt.contour(x1,x2,z,1,colors=rgb)
    
    # Plot each mean
    for k in range(K):
        plt.scatter(mu[k][0],mu[k][1],color='k')
    
    # Plot labels
    plt.title('Observations with Means from EM (Color Plot)')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.xlim([-10,10])
    plt.ylim([-10,10])
    
    # Show and save plot
    dirname = "anim/M=%d/K=%d" % (M,K) 
    figname = "EM_M=%d_K=%d_color_%03d.png" % (M,K,frame)
    plt.savefig(dirname + '/' + figname)
    plt.show()

# Only works for one features
def plotHistogram(X, mu, var, histogram_scale, frame):
    
    # Don't plot if not 1 feature
    M = X.shape[1]
    if M != 1:
        return
        
    # Set figure size and get plot axes
    figsize = (8,6)
    plt.figure(figsize=figsize)
    axes = plt.gca()
    
    # Plot histogram for observations
    plt.hist(X,bins=histogram_scale*5,color='y')
    
    # Plot distribution resulting from parameters from current EM iteration
    # Plot distribution for each mean over some range x (defined to span s std.dev's from mean)
    K = len(mu)
    for k in range(K):
        n = 100 # number of points in range
        s = 3 # number of std. devs to span
        sigma_k = np.sqrt(var[k])
        # Convert mu[k], sigma_k to scalars because they are single elements nested within arrays, will cause issues
        x = np.linspace(np.asscalar(mu[k]-s*sigma_k),np.asscalar(mu[k]+s*sigma_k),n)
        #x = np.linspace(-50,50,n) # hard code range for now
        # Scale up normals (scale factor depends on size of clusters)
        if M == 3:
            rgb = 'r' if (k == 0) else ('g' if (k == 1) else 'b');
            plt.plot(x,histogram_scale*mlab.normpdf(x,np.asscalar(mu[k]),np.asscalar(sigma_k)),color=rgb)    
        else:
            plt.plot(x,histogram_scale*mlab.normpdf(x,np.asscalar(mu[k]),np.asscalar(sigma_k)))   

    # Plot labels
    plt.title('Histogram Plot with EM Distribution')
    plt.xlabel('x (1-feature observations)')
    plt.ylabel('Distribution')
    plt.xlim([-60,60])
    plt.ylim([0,histogram_scale*0.5]) # use 50% of peak distribution size for now
    
    # Show and save plot
    dirname = "anim/M=%d/K=%d" % (M,K) 
    figname = "EM_M=%d_K=%d_histogram_%03d.png" % (M,K,frame)
    plt.savefig(dirname + '/' + figname)
    plt.show()
    
In [4]:
### ~~~~~ Expectation Maximization ~~~~~ ###

# Perform Expectation-Maximization on a given observation set X
# X is an N-by-M matrix
# Specify K, the number of clusters
# Specify histogram_scale, which specifies the number of bins and scaling factor for normal distribution
def EM(X, K=2, histogram_scale=20):
    
    # Can't have 0 or less clusters, terminate algo
    if K <= 0:
        return
    
    # N observations, M features
    N = X.shape[0]
    M = X.shape[1]
    
    print "N:", N, "M:", M
    print
    print "Before EM:"
    
    # First make new directory for image files, if it doesn't exist
    dirname = "anim/M=%d/K=%d" % (M,K)
    try:
        os.makedirs(dirname)
        print ('Making new directory %s...') % (dirname)
    except:
        print ('Directory %s already exists, clearing files...') % (dirname)
        # If directory already exists, clear all files in there
        file_list = glob.glob('anim/M={}/K={}/*.png'.format(M,K))
        for f in file_list:
            os.remove(f)
    
    ### Step 1: Initialize parameters and log-likelihood ###
    
    # Initialize containers for parameters
    # mu_k is M-by-1 => mu is k-by-M-by-1
    # cov_k is M-by-M => cov is k-by-M-by-M
    # pi_k is 1-by-1 => pi is k-by-1-by-1
    # Just use k-dim lists
    mu = [np.asmatrix(np.random.rand(M,1)) for k in range(K)]
        # random mean
        # NOTE: mu's can't be the same for all K because then normpdf will be the same for all K
        # Then, nothing will change because responsibility is always equal to pi (denom cancels out with numer, leaving pi)
    cov = [npm.eye(M) for k in range(K)]
        # all covariances are identity for now
        # results in NaN values for normpdf for M = 1, K >= 4, so will change it for M = 1 only
    if M == 1:
        cov = [K for k in range(K)]
        # cov=2 works for K=5, apparently need higher covariances for higher K?
    pi = [float(1)/K for k in range(K)]
        # make pi's all the same for now, bad thing about this is that we can't use pi's as convergence check
        # NOTE: can't initialize to 0 or denom will be 0 when calculating responsibility
    
    # Initialize log-likelihood
    log_likelihood = 0
    
    # Initialize responsibilities to zero, N responsibilities for each k
    # gamma_k is N-by-1
    gamma = [npm.zeros((N,1)) for k in range(K)]
    
    # Initialize Nk's
    Nk = [0 for k in range(K)]
    
    # Make plots for initial parameters, also creates frame 0 of gif
    if M == 2:
        if K > 3:
            plotContours(X,mu,cov,0)
        elif K > 1:
            plotColors(X,mu,cov,gamma,0)
    elif M == 1:
        plotHistogram(X,mu,cov,histogram_scale,0)
    print
    
    ### Repeat steps 2 to 4 until convergence ###
    # Cap at a certain number of iterations so you don't potential get an infinite loop
    iterations = 100
    for iteration in range(iterations):
        
        # NOTE: Need to reset values to 0 before doing ANY summation
        
        # Compute the frozen multivariate normal for current mu, cov before doing anything else
        # This may save some computation time
        # Use np.ravel to make mean array-like (doesn't deal with column matrices)
        normpdf = [0 for k in range(K)]
        for k in range(K):
            normpdf[k] = scipy.stats.multivariate_normal(mean=np.ravel(mu[k]),cov=cov[k])
    
        ### Step 2: E step, update responsibilities ###
        for k in range(K):
            for n in range(N):

                # Compute denominator of responsibility
                denom = 0
                for j in range(K):
                    denom += pi[j]*normpdf[j].pdf(np.ravel(X[n,:]))
                                
                # Needed to call multivariate_normal without x as input to freeze the distribution
                # Now get the associated probability using pdf, use np.ravel again to avoid issues with input
                gamma[k][n,0] = pi[k]*normpdf[k].pdf(np.ravel(X[n,:]))/denom
                
        # Need to update Nk's after responsibilities are calculated
        for k in range(K):
            Nk[k] = 0
            for n in range(N):
                Nk[k] += gamma[k][n,0]
        
        ### Step 3: M step, update parameters using new responsibilities ###
        # X[n,:] is 1-by-M, so transpose for M-by-1

        # Update mu's
        mu_old = list(mu) # need to make a copy using list(), assignment copies the list reference, not the list itself
        for k in range(K):
            mu[k] = npm.zeros((M,1))
            for n in range(N):
                # Need to index into 2nd dim as well to get scalar (1x1 times Mx1 doesn't work)
                mu[k] += gamma[k][n,0]*X[n,:].T/float(Nk[k])
        
        # Update cov's, need new mu_k's to update cov_k's
        cov_old = list(cov)
        for k in range(K):
            cov[k] = npm.zeros((M,M))
            for n in range(N):
                cov[k] += gamma[k][n,0]*(X[n,:].T-mu[k])*(X[n,:].T-mu[k]).T/float(Nk[k])
         
        # Update pi's
        pi_old = list(pi)
        for k in range(K):
            pi[k] = Nk[k]/float(N)
                    
        # Step 4: Evaluate log likelihood
        log_likelihood_old = log_likelihood
        log_likelihood = 0
        for n in range(N):
            temp = 0;
            for k in range(K):
                temp += pi[k]*normpdf[k].pdf(np.ravel(X[n,:]))
            log_likelihood += np.log(temp)
        
        # Print stats
        print ("Iteration %d:" % (iteration+1))
        #print "Responsibility:", gamma
        #print "Nk:", Nk
        #print "mu:", mu
        #print "cov:", cov
        #print "pi:", pi
        print "Log likelihood:", log_likelihood
        
        # Plot result of EM after each iteration, also generates frame corresponding to each iteration
        if M == 2:
            if K > 3:
                plotContours(X,mu,cov,iteration+1) # Frame iteration+1, since iteration starts at 0
            elif K > 1:
                plotColors(X,mu,cov,gamma,iteration+1)
        elif M == 1:
            plotHistogram(X,mu,cov,histogram_scale,iteration+1)
        print
        
        ### Convergence test, break EM if either parameters or log likelihood converge ###
        
        if abs(log_likelihood-log_likelihood_old) < 0.001:
            print ("K-Means converged on iteration %d, log likelihood converged (old=%.5f,new=%.5f,diff=%.5f)." \
                       % (iteration+1,log_likelihood_old,log_likelihood,abs(log_likelihood-log_likelihood_old)))
            break
            
        # Commenting this out for now for smoother looking graphs
        """
        break_EM = False
        for k in range(K):
            if np.linalg.norm(mu[k]-mu_old[k],ord=2) < 0.001 or \
                np.linalg.norm(cov[k]-cov_old[k],ord=2) < 0.001:# or \
                #abs(pi[k]-pi[k]) < 0.001: # ignore this for now because pi is initialized to be the same
                    print ("K-Means converged on iteration %d, parameters converged.\nDisplaying log likelihoods: (old=%.5f,new=%.5f,diff=%.5f)" \
                           % (iteration+1,log_likelihood_old,log_likelihood,abs(log_likelihood-log_likelihood_old)))
                    break_EM = True
                    break

        if break_EM:
            break
        """
        
        # Uncomment for only one iteration
        #if iteration == 1:
        #    break
        
    frames = iteration+1
    return gamma, mu, cov, pi, log_likelihood, frames
In [5]:
### ~~~~~ Animation and File Cleanup ~~~~~ ###

# Turn saved plot images into animation
# Specify M, K just for naming purposes
# Specify number of frames so that the last frame can be delayed longer
# Specify frame rate
def generateAnimation(M, K, delay):
    
    # Only plotting for K >= 1, M = 1,2
    if K <= 0 or (M != 1 and M != 2):
        return
    
    dirname = "anim/M=%d/K=%d"% (M,K)
        # Get name of directory to plot
    
    file_list = glob.glob(dirname + '/' + '*.png')
        # Get all the pngs in the current directory
    list.sort(file_list, key=lambda x: int(x.split('_')[4].split('.png')[0]))
        # Sort the images by number, which is the 5th element delimited by '_', and exclude '.png'
    
    with open('image_list.txt', 'w') as file:         
        for item in file_list:
            file.write("%s\n" % item)
                # Write image names to text file

    plot_type = 'histogram' if M == 1 else ('contour' if K > 3 else 'color')
    gifname = "EM_M=%d_K=%d_%s" % (M,K,plot_type)
    os.system('convert @image_list.txt -delay {} {}.gif'.format(delay,gifname)) 
        # Use ImageMagick to convert list into gif
        # Check if ImageMagick is first on your computer, ex. !brew install imagemagick
        # On windows convert is 'magick'
        
    # How to change delays for a specific frame
    # 1. Clone the frame (frame 0 is first frame) and change delay, the cloned frame is inserted past the old frame
    # 2. Swap old frame with cloned frame, using - to indicate the previous frame.
    # 3. Delete the old frame which is now the next frame, using +.
    #os.system('convert {}.gif \( -clone 0 -set delay 100 \) -swap 0 +delete '.format(gifname) + \
    #                         '\( +clone -set delay 100 \) +swap +delete {}.gif'.format(gifname))
    # We only want to delay the last frame
    # +clone will make a copy of the last image in the sequence
    # +swap will switch the last two images in the sequence
    os.system('convert {}.gif \( +clone -set delay 100 \) +swap +delete {}.gif'.format(gifname,gifname))    
    
    
# Cleanup files used to generate animation
# Remove text file, all individual PNGs and their containing folders
def cleanup():
    
    os.remove('image_list.txt')
    
    # I'm kinda antsy about using shutil because it's too powerful
    # So lose access to it RIGHT AFTER deleting desired files
    # Could alternatively use glob to search through files then os.remove or os.rmdir one by one
    import shutil
    shutil.rmtree('anim')
    del shutil
In [7]:
### ~~~~~ Runtime ~~~~~ ###

#X = npm.ones((5,3))
#(X_circles,Y_circles,X,Y_unimodal,X_spiral,Y_spiral,X_bimodal,Y_bimodal) = getSpecialData()
#(X,Y) = getKeeneData(50,50)
#EM(X,K=2)

# Delay between each frame of animation
# Why is this so fast???
delay = 30

print '----------------------------------------------------------'
print ('1-Feature Dataset')
print '----------------------------------------------------------\n'

K=2
print '----------------------------------------------------------'
print ('K = %d' % (K))
print '----------------------------------------------------------\n'
size = 50
X = generate1DClusters(K,[size,size])
M = X.shape[1]
(gamma,mu,cov,pi,log_likelihood,frames) = EM(X,K=K,histogram_scale=size)
generateAnimation(M,K,delay)
print

K=3
print '----------------------------------------------------------'
print ('K = %d' % (K))
print '----------------------------------------------------------\n'
size = 20
X = generate1DClusters(K,[size,size,size])
M = X.shape[1]
(gamma,mu,cov,pi,log_likelihood,frames) = EM(X,K=K,histogram_scale=size)
generateAnimation(M,K,delay)
print

K=5
print '----------------------------------------------------------'
print ('K = %d' % (K))
print '----------------------------------------------------------\n'
size = 20
X = generate1DClusters(K,[size,size,size,size,size])
M = X.shape[1]
(gamma,mu,cov,pi,log_likelihood,frames) = EM(X,K=K,histogram_scale=size)
generateAnimation(M,K,delay)
print 
----------------------------------------------------------
1-Feature Dataset
----------------------------------------------------------

----------------------------------------------------------
K = 2
----------------------------------------------------------

N: 100 M: 1

Before EM:
Directory anim/M=1/K=2 already exists, clearing files...
Iteration 1:
Log likelihood: -15576.4856722
Iteration 2:
Log likelihood: -242.775420986
Iteration 3:
Log likelihood: -219.72043443
Iteration 4:
Log likelihood: -219.72043443
K-Means converged on iteration 4, log likelihood converged (old=-219.72043,new=-219.72043,diff=0.00000).

----------------------------------------------------------
K = 3
----------------------------------------------------------

N: 60 M: 1

Before EM:
Directory anim/M=1/K=3 already exists, clearing files...
Iteration 1:
Log likelihood: -7275.60219367
Iteration 2:
Log likelihood: -272.473454932
Iteration 3:
Log likelihood: -245.276002634
Iteration 4:
Log likelihood: -214.722656449
Iteration 5:
Log likelihood: -158.574786092
Iteration 6:
Log likelihood: -158.357144341
Iteration 7:
Log likelihood: -158.357144341
K-Means converged on iteration 7, log likelihood converged (old=-158.35714,new=-158.35714,diff=0.00000).

----------------------------------------------------------
K = 5
----------------------------------------------------------

N: 100 M: 1

Before EM:
Directory anim/M=1/K=5 already exists, clearing files...
Iteration 1:
Log likelihood: -8071.62668071
Iteration 2:
Log likelihood: -467.644649746
Iteration 3:
Log likelihood: -467.210938758
Iteration 4:
Log likelihood: -466.985711221
Iteration 5:
Log likelihood: -466.776293431
Iteration 6:
Log likelihood: -466.555613422
Iteration 7:
Log likelihood: -466.329099013
Iteration 8:
Log likelihood: -466.105841792
Iteration 9:
Log likelihood: -465.88527435
Iteration 10:
Log likelihood: -465.652888417
Iteration 11:
Log likelihood: -465.386014162
Iteration 12:
Log likelihood: -465.062588775
Iteration 13:
Log likelihood: -464.672488549
Iteration 14:
Log likelihood: -464.23798044
Iteration 15:
Log likelihood: -463.82623242
Iteration 16:
Log likelihood: -463.502062452
Iteration 17:
Log likelihood: -463.264337806
Iteration 18:
Log likelihood: -463.070276829
Iteration 19:
Log likelihood: -462.873893375
Iteration 20:
Log likelihood: -462.620097261
Iteration 21:
Log likelihood: -462.198676075
Iteration 22:
Log likelihood: -461.264684277
Iteration 23:
Log likelihood: -458.242873546
Iteration 24:
Log likelihood: -445.136975203
Iteration 25:
Log likelihood: -432.06618364
Iteration 26:
Log likelihood: -431.682982753
Iteration 27:
Log likelihood: -431.523887073
Iteration 28:
Log likelihood: -431.450860482
Iteration 29:
Log likelihood: -431.409958897
Iteration 30:
Log likelihood: -431.375567836
Iteration 31:
Log likelihood: -431.33428469
Iteration 32:
Log likelihood: -431.275011441
Iteration 33:
Log likelihood: -431.180018411
Iteration 34:
Log likelihood: -431.008143466
Iteration 35:
Log likelihood: -430.636517842
Iteration 36:
Log likelihood: -429.568564071
Iteration 37:
Log likelihood: -424.955317513
Iteration 38:
Log likelihood: -404.738914677
Iteration 39:
Log likelihood: -397.178995692
Iteration 40:
Log likelihood: -379.020645902
Iteration 41:
Log likelihood: -370.392293727
Iteration 42:
Log likelihood: -366.778435401
Iteration 43:
Log likelihood: -353.051405999
Iteration 44:
Log likelihood: -334.182476742
Iteration 45:
Log likelihood: -317.832982159
Iteration 46:
Log likelihood: -295.97093055
Iteration 47:
Log likelihood: -295.970930366
K-Means converged on iteration 47, log likelihood converged (old=-295.97093,new=-295.97093,diff=0.00000).

In [8]:
### ~~~~~ Runtime (cont'd) ~~~~~ ###

print '----------------------------------------------------------'
print ('2-Feature Datasets')
print '----------------------------------------------------------\n'

K=2
print '----------------------------------------------------------'
print ('K = %d' % (K))
print '----------------------------------------------------------\n'
X = generateClusters(K,[50,50,50,50,50],r=5)
M = X.shape[1]
(gamma,mu,cov,pi,log_likelihood,frames) = EM(X,K=K)
generateAnimation(M,K,delay)
print

K=3
print '----------------------------------------------------------'
print ('K = %d' % (K))
print '----------------------------------------------------------\n'
X = generateClusters(K,[50,50,50,50,50],r=5)
M = X.shape[1]
(gamma,mu,cov,pi,log_likelihood,frames) = EM(X,K=K)
generateAnimation(M,K,delay)
print

K=5
print '----------------------------------------------------------'
print ('K = %d' % (K))
print '----------------------------------------------------------\n'
X = generateClusters(K,[50,50,50,50,50],r=5)
M = X.shape[1]
(gamma,mu,cov,pi,log_likelihood,frames) = EM(X,K=K)
generateAnimation(M,K,delay)
print
----------------------------------------------------------
2-Feature Datasets
----------------------------------------------------------

----------------------------------------------------------
K = 2
----------------------------------------------------------

N: 100 M: 2

Before EM:
Directory anim/M=2/K=2 already exists, clearing files...
Iteration 1:
Log likelihood: -1527.45928762
Iteration 2:
Log likelihood: -446.148894747
Iteration 3:
Log likelihood: -446.121208339
Iteration 4:
Log likelihood: -446.087142683
Iteration 5:
Log likelihood: -446.04419156
Iteration 6:
Log likelihood: -445.989829154
Iteration 7:
Log likelihood: -445.920306966
Iteration 8:
Log likelihood: -445.828231019
Iteration 9:
Log likelihood: -445.697324209
Iteration 10:
Log likelihood: -445.490267144
Iteration 11:
Log likelihood: -445.116046171
Iteration 12:
Log likelihood: -444.325195103
Iteration 13:
Log likelihood: -442.298656839
Iteration 14:
Log likelihood: -435.639437516
Iteration 15:
Log likelihood: -408.900238847
Iteration 16:
Log likelihood: -354.395644618
Iteration 17:
Log likelihood: -352.503454384
Iteration 18:
Log likelihood: -352.503454384
K-Means converged on iteration 18, log likelihood converged (old=-352.50345,new=-352.50345,diff=0.00000).

----------------------------------------------------------
K = 3
----------------------------------------------------------

N: 150 M: 2

Before EM:
Directory anim/M=2/K=3 already exists, clearing files...
Iteration 1:
Log likelihood: -2270.05936415
Iteration 2:
Log likelihood: -766.185961009
Iteration 3:
Log likelihood: -709.22968203
Iteration 4:
Log likelihood: -687.210949704
Iteration 5:
Log likelihood: -665.662213216
Iteration 6:
Log likelihood: -661.990494425
Iteration 7:
Log likelihood: -659.761491217
Iteration 8:
Log likelihood: -656.691905177
Iteration 9:
Log likelihood: -648.575615417
Iteration 10:
Log likelihood: -632.14864073
Iteration 11:
Log likelihood: -622.366769517
Iteration 12:
Log likelihood: -613.130175164
Iteration 13:
Log likelihood: -593.920768663
Iteration 14:
Log likelihood: -574.630548976
Iteration 15:
Log likelihood: -574.112386018
Iteration 16:
Log likelihood: -574.112386018
K-Means converged on iteration 16, log likelihood converged (old=-574.11239,new=-574.11239,diff=0.00000).

----------------------------------------------------------
K = 5
----------------------------------------------------------

N: 250 M: 2

Before EM:
Directory anim/M=2/K=5 already exists, clearing files...
Iteration 1:
Log likelihood: -3696.72561548
Iteration 2:
Log likelihood: -1329.1535457
Iteration 3:
Log likelihood: -1293.94182593
Iteration 4:
Log likelihood: -1233.446042
Iteration 5:
Log likelihood: -1207.2888741
Iteration 6:
Log likelihood: -1194.11337539
Iteration 7:
Log likelihood: -1186.37855043
Iteration 8:
Log likelihood: -1178.88461359
Iteration 9:
Log likelihood: -1173.17933588
Iteration 10:
Log likelihood: -1171.88557234
Iteration 11:
Log likelihood: -1170.54566312
Iteration 12:
Log likelihood: -1167.54582856
Iteration 13:
Log likelihood: -1159.10870449
Iteration 14:
Log likelihood: -1153.53797394
Iteration 15:
Log likelihood: -1150.34638456
Iteration 16:
Log likelihood: -1148.41215184
Iteration 17:
Log likelihood: -1147.05969331
Iteration 18:
Log likelihood: -1145.94182646
Iteration 19:
Log likelihood: -1145.03740985
Iteration 20:
Log likelihood: -1144.32857806
Iteration 21:
Log likelihood: -1143.74263808
Iteration 22:
Log likelihood: -1143.1751378
Iteration 23:
Log likelihood: -1142.50192015
Iteration 24:
Log likelihood: -1141.57092436
Iteration 25:
Log likelihood: -1140.19972113
Iteration 26:
Log likelihood: -1138.22465024
Iteration 27:
Log likelihood: -1135.61945645
Iteration 28:
Log likelihood: -1132.51673209
Iteration 29:
Log likelihood: -1128.53926293
Iteration 30:
Log likelihood: -1123.4065741
Iteration 31:
Log likelihood: -1121.1137372
Iteration 32:
Log likelihood: -1119.40916575
Iteration 33:
Log likelihood: -1117.52289565
Iteration 34:
Log likelihood: -1114.80619941
Iteration 35:
Log likelihood: -1110.49848121
Iteration 36:
Log likelihood: -1106.10543668
Iteration 37:
Log likelihood: -1104.42058925
Iteration 38:
Log likelihood: -1103.51073
Iteration 39:
Log likelihood: -1102.81426168
Iteration 40:
Log likelihood: -1102.19892224
Iteration 41:
Log likelihood: -1101.56127688
Iteration 42:
Log likelihood: -1100.78361474
Iteration 43:
Log likelihood: -1099.7096901
Iteration 44:
Log likelihood: -1098.16312427
Iteration 45:
Log likelihood: -1096.11781695
Iteration 46:
Log likelihood: -1093.87067838
Iteration 47:
Log likelihood: -1091.55124134
Iteration 48:
Log likelihood: -1088.87531937
Iteration 49:
Log likelihood: -1085.27903254
Iteration 50:
Log likelihood: -1079.79191194
Iteration 51:
Log likelihood: -1071.2755109
Iteration 52:
Log likelihood: -1061.64637762
Iteration 53:
Log likelihood: -1056.12269584
Iteration 54:
Log likelihood: -1055.94870344
Iteration 55:
Log likelihood: -1055.94737607
Iteration 56:
Log likelihood: -1055.94726635
K-Means converged on iteration 56, log likelihood converged (old=-1055.94738,new=-1055.94727,diff=0.00011).

In [11]:
### ~~~~~ Runtime (cont'd) ~~~~~ ###

K=7
print '----------------------------------------------------------'
print ('K = %d' % (K))
print '----------------------------------------------------------\n'
size = 100
X = generateClusters(K,[size,size,size,size,size,size,size],r=7)
M = X.shape[1]
(gamma,mu,cov,pi,log_likelihood,frames) = EM(X,K=K)
generateAnimation(M,K,delay)
print
----------------------------------------------------------
K = 7
----------------------------------------------------------

N: 700 M: 2

Before EM:
Directory anim/M=2/K=7 already exists, clearing files...
Iteration 1:
Log likelihood: -18303.9546329
Iteration 2:
Log likelihood: -3986.71814706
Iteration 3:
Log likelihood: -3834.80829741
Iteration 4:
Log likelihood: -3743.62015881
Iteration 5:
Log likelihood: -3692.12440115
Iteration 6:
Log likelihood: -3663.20051052
Iteration 7:
Log likelihood: -3651.9084929
Iteration 8:
Log likelihood: -3640.59446804
Iteration 9:
Log likelihood: -3629.65593006
Iteration 10:
Log likelihood: -3619.27259912
Iteration 11:
Log likelihood: -3607.82663942
Iteration 12:
Log likelihood: -3594.79091171
Iteration 13:
Log likelihood: -3585.93420297
Iteration 14:
Log likelihood: -3579.09852595
Iteration 15:
Log likelihood: -3571.19122394
Iteration 16:
Log likelihood: -3560.96409271
Iteration 17:
Log likelihood: -3550.48032658
Iteration 18:
Log likelihood: -3544.2810095
Iteration 19:
Log likelihood: -3540.44742637
Iteration 20:
Log likelihood: -3534.8159838
Iteration 21:
Log likelihood: -3524.92495234
Iteration 22:
Log likelihood: -3511.8609328
Iteration 23:
Log likelihood: -3500.15199766
Iteration 24:
Log likelihood: -3490.86708694
Iteration 25:
Log likelihood: -3483.14329615
Iteration 26:
Log likelihood: -3475.74996073
Iteration 27:
Log likelihood: -3467.49414807
Iteration 28:
Log likelihood: -3457.47382377
Iteration 29:
Log likelihood: -3444.69067733
Iteration 30:
Log likelihood: -3427.77655511
Iteration 31:
Log likelihood: -3411.27774326
Iteration 32:
Log likelihood: -3400.74840142
Iteration 33:
Log likelihood: -3393.44892016
Iteration 34:
Log likelihood: -3388.75775745
Iteration 35:
Log likelihood: -3383.15591029
Iteration 36:
Log likelihood: -3375.40554835
Iteration 37:
Log likelihood: -3364.62047101
Iteration 38:
Log likelihood: -3352.00861135
Iteration 39:
Log likelihood: -3345.19938254
Iteration 40:
Log likelihood: -3344.24160204
Iteration 41:
Log likelihood: -3344.11120495
Iteration 42:
Log likelihood: -3344.08571288
Iteration 43:
Log likelihood: -3344.08185789
Iteration 44:
Log likelihood: -3344.08135699
K-Means converged on iteration 44, log likelihood converged (old=-3344.08186,new=-3344.08136,diff=0.00050).

In [ ]:
# If there's an issue with the gif, use this to regenerate
delay = 200 # This should be 2 seconds but it's pretty fast... w/e
generateAnimation(1,3,delay)
generateAnimation(2,2,delay)
generateAnimation(2,3,delay)
generateAnimation(2,5,delay)
In [12]:
# Delete extraneous files after gif is generated
cleanup()
In [ ]: